import ast
import re
from collections import defaultdict

import pandas as pd

def sample_passage(p, fname):
    """
    Write one passage from the given path+filename.

    Reads the text file at ``p + fname`` (plain string concatenation, so any
    separator must already be part of ``p``), requests 5 candidate samples
    from ``utils.sample_5_sents`` and saves one acceptable candidate under
    /media/secure_volume/detecting-narrativity/<category>/, where ``category``
    is a module-level global.

    A candidate is acceptable when it is 200 to 1500 characters long and does
    not start with a single quote. When several candidates qualify, the last
    one is the one that ends up on disk.

    Returns True if a passage was written, False otherwise.
    """
    with open(p + fname, 'r') as FILE:
        txt = FILE.read()
    # NOTE(review): `utils` is not imported in the visible part of this file;
    # confirm it is made available elsewhere.
    random_5s = utils.sample_5_sents(txt, N_SAMPLES=5)

    chosen = None
    for text in random_5s:
        # The length bounds are tested first, so an empty sample is rejected
        # rather than raising IndexError on a first-character lookup.
        if 200 <= len(text) <= 1500 and not text.startswith("'"):
            chosen = text
    if chosen is None:
        return False

    # fname[:-4] drops the last four characters of the input filename; every
    # run of non-word characters in the rest is collapsed to a single '-'.
    out_name = category + '_5S_' + re.sub(r'\W+', '-', fname[:-4]) + '.txt'
    out_path = '/media/secure_volume/detecting-narrativity/' + category + '/' + out_name
    # All candidates share one output path, so a single write of the selected
    # candidate leaves the same file content as rewriting it per candidate.
    with open(out_path, 'w') as F:
        F.write(chosen)
    return True


def main():
    """
    Sample up to THRESHOLD passages per year and print a summary.

    Reads the module-level ``metadata`` table (columns 'htid', 'year' and
    'page_numbers_str') and the module-level ``path``. For each row, the pages
    listed in 'page_numbers_str' are tried in order until sample_passage()
    reports that a passage was written; that page's filename is then recorded
    under the row's year. Rows whose year already holds THRESHOLD filenames
    are skipped.

    Prints every year that ended with fewer than THRESHOLD filenames, then the
    total number of recorded filenames.
    """
    THRESHOLD = 500

    map_year_fnames = defaultdict(list)
    for htid, year, page_str in metadata[['htid', 'year', 'page_numbers_str']].values:
        # The lookup registers the year (with an empty list) even if nothing is
        # ever written for it, so it still appears in the report below.
        year_fnames = map_year_fnames[year]
        if len(year_fnames) >= THRESHOLD:
            # Year is already full: skip before parsing the page list.
            continue
        page_numbers = ast.literal_eval(page_str)
        htid = htid.replace(",", ".").replace("+", ":").replace("=", "/")
        for pg in page_numbers:
            # pg[:-4] drops the last four characters of the page entry
            # (presumably a '.txt' extension; confirm against the metadata).
            fname = htid + '____PAGE____' + pg[:-4] + '_clean.txt'
            if sample_passage('path-to-txt-fname', fname):
                # One passage per row is enough; move on to the next row.
                year_fnames.append(fname)
                break

    for y in sorted(map_year_fnames):
        n_written = len(map_year_fnames[y])
        if n_written < THRESHOLD:
            print(y, "--", n_written)
    count = sum(len(fnames) for fnames in map_year_fnames.values())
    print("\n\n\n--------------\n\nTotal volumes for {} = {}".format(path.split('/')[-1], count))
    
    
if __name__ == '__main__':
    # `path`, `category` and `metadata` are module-level globals that
    # sample_passage() and main() read directly.
    # Switch datasets by swapping which of the two `path` lines is active.
    # path = '/Users/sunyambagga/Desktop/txtLAB-2/Million-Page-Dataset/FINAL-DATASET/NON_metadata.tsv'
    path = '/Users/sunyambagga/Desktop/txtLAB-2/Million-Page-Dataset/FINAL-DATASET/FICTION_metadata.tsv'
    if 'FICTION_metadata' in path:
        category = 'FIC'
    elif 'NON_metadata' in path:
        category = 'NON'
    else:
        # Fail here with a clear message instead of a NameError on `category`
        # the first time sample_passage() builds an output path.
        raise ValueError("Cannot infer category from metadata path: {}".format(path))
    metadata = pd.read_csv(path, delimiter='\t')
    print("Unique volumes:", metadata.shape[0])
    main()